Introduction

For this project, I wanted to map the number of police use of force cases across county jurisdictions for the state of California. I will be using data obtained from the California Department of Justice’s (DOJ) Use of Force dataset, obtained from https://openjustice.doj.ca.gov/data. The California DOJ provides open access data on the criminal justice system.

The Use of Force dataset incluces data from 2016-2018 of incidents that resulted in serious bodily injury or death or involved the discharge of a firearm, as reported by Law Enforcement Agencies and other entities throughout the state that employ peace officers.

Load libraries

library(readr)
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
library(mapdata)
library(tidyverse)
## ── Attaching packages ──────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.0     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ purrr   0.3.4     ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::map()    masks maps::map()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(viridis)
## Loading required package: viridisLite
library(flexdashboard)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(knitr)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms

Import and Cleanup Data

I began by importing the data from the California DOJ website.

library(readr)
Incident_2018 <- read_csv(url("https://data-openjustice.doj.ca.gov/sites/default/files/dataset/2019-07/URSUS_Incident_2018.csv"))
## Parsed with column specification:
## cols(
##   INCIDENT_ID = col_character(),
##   ORI = col_character(),
##   INCIDENT_DATE_STR = col_character(),
##   INCIDENT_TIME_STR = col_time(format = ""),
##   CITY = col_character(),
##   COUNTY = col_character(),
##   STATE = col_character(),
##   ZIP_CODE = col_double(),
##   MULTIPLE_LOCATIONS = col_logical(),
##   ON_K12_CAMPUS = col_logical(),
##   ARREST_MADE = col_logical(),
##   CRIME_REPORT_FILED = col_logical(),
##   CONTACT_REASON = col_character(),
##   IN_CUSTODY_REASON = col_character(),
##   NUM_INVOLVED_CIVILIANS = col_double(),
##   NUM_INVOLVED_OFFICERS = col_double(),
##   NUM_INVOLVED_AGENCIES = col_double()
## )
Incident_2017 <- read_csv(url("https://data-openjustice.doj.ca.gov/sites/default/files/dataset/2018-08/URSUS_Incident_2017.csv"))
## Parsed with column specification:
## cols(
##   INCIDENT_ID = col_character(),
##   ORI = col_character(),
##   INCIDENT_DATE_STR = col_character(),
##   INCIDENT_TIME_STR = col_time(format = ""),
##   CITY = col_character(),
##   COUNTY = col_character(),
##   STATE = col_character(),
##   ZIP_CODE = col_double(),
##   MULTIPLE_LOCATIONS = col_logical(),
##   ON_K12_CAMPUS = col_logical(),
##   ARREST_MADE = col_logical(),
##   CRIME_REPORT_FILED = col_logical(),
##   CONTACT_REASON = col_character(),
##   IN_CUSTODY_REASON = col_character(),
##   NUM_INVOLVED_CIVILIANS = col_double(),
##   NUM_INVOLVED_OFFICERS = col_double(),
##   PRIMARY_AGENCY_INDICATOR = col_character()
## )
Incident_2016 <- read_csv(url("https://data-openjustice.doj.ca.gov/sites/default/files/dataset/2018-08/URSUS_Incident_2016.csv"))
## Parsed with column specification:
## cols(
##   Incident_ID = col_character(),
##   ORI = col_character(),
##   Incident_Date_Str = col_character(),
##   Incident_Time_Str = col_double(),
##   City = col_character(),
##   County = col_character(),
##   State = col_character(),
##   Zip_Code = col_double(),
##   Multiple_Locations = col_logical(),
##   On_K12_Campus = col_logical(),
##   Arrest_Made = col_logical(),
##   Crime_Report_Filed = col_logical(),
##   Contact_Reason = col_character(),
##   In_Custody_Reason = col_character(),
##   Num_Involved_Civilians = col_double(),
##   Num_Involved_Officers = col_double()
## )

The data consists of several flat files including information on each individual incident, each person - officer and civilian - invovled in the incident and each agency involved in the incident. These flat files are related along the key, Incident_ID, which is the identifier used to represent each unique incident or case, or the key, ORI, which is the number of the reporting agency.

view(Incident_2016)
view(Incident_2017)
view(Incident_2018)

To bind the data, I used the row_bind function in the dplyr package. To merge the 2016, 2017 and 2018 incident datasets, I needed to first ensure that all the variable names and variable types were identical. This required cleaning up the 2016 dataset by renaming variables and converting the INCIDENT_TIME_STR variable to a time variable, to match with the 2017 and 2018 datasets.

Incident_2016 <- rename(Incident_2016,
                        "INCIDENT_ID" = "Incident_ID",
                        "ORI" = "ORI",
                        "INCIDENT_DATE_STR" = "Incident_Date_Str",
                        "INCIDENT_TIME_STR" = "Incident_Time_Str",
                        "CITY" = "City",
                        "COUNTY" = "County",
                        "STATE" = "State",
                        "ZIP CODE" = "Zip_Code",
                        "MULTIPLE_LOCATIONS" = "Multiple_Locations",
                        "ON_K12_CAMPUS" = "On_K12_Campus",
                        "ARREST_MADE" = "Arrest_Made",
                        "CRIME_REPORT_FILED" = "Crime_Report_Filed",
                        "CONTACT_REASON" = "Contact_Reason",
                        "IN_CUSTODY_REASON" = "In_Custody_Reason",
                        "NUM_INVOLVED_CIVILIANS" = "Num_Involved_Civilians",
                        "NUM_INVOLVED_OFFICERS" = "Num_Involved_Officers")

Incident_2016 <- Incident_2016 %>%
        mutate(INCIDENT_TIME_STR = as.hms(Incident_2016$INCIDENT_TIME_STR))
## Warning: as.hms() is deprecated, please use as_hms().
## This warning is displayed once per session.

After cleaning up the 2016 dataset, I used the row_bind function to combine the datasets into a single tibble.

Incident_data <- Incident_2018 %>%
        bind_rows(Incident_2017)

Incident_data <- Incident_data %>%
        bind_rows(Incident_2016)

Now that the incident data is combined, let’s explore!

Exploratory data analysis

The first thing that I’m interested in is the total number of police use of force cases that resulted in serious bodily injury or death.

To begin, I need to designate the year in the mmddyyy values of the INCIDENT_DATE_STR variable.

Incident_data <- Incident_data %>%
        mutate(INCIDENT_DATE_STR = mdy(INCIDENT_DATE_STR))

Incident_data <- Incident_data %>%
        tibble(date = INCIDENT_DATE_STR,
               year = year(INCIDENT_DATE_STR),
               month = month(INCIDENT_DATE_STR),
               day = day(INCIDENT_DATE_STR))

Since every case represents a single incident, I can use the following function.

Incident_data %>%
        count(year)
## # A tibble: 3 x 2
##    year     n
##   <dbl> <int>
## 1  2016   782
## 2  2017   724
## 3  2018   630
ggplot(Incident_data) +
        geom_bar(mapping = aes(x = year), fill = "blue")

The data shows that there has been a gradual decline in the number of police use of force incidents that result in serious bodily injury or death across the state of California.

The California DOJ also notes that 2016 was the first year that agencies reported these data, and since not every law enforcement agency had the capacity to report these data, the 2016 dataset was likely less complete than latter years. This suggests that the incident count for 2016 was likely higher than what we see in the above graph, which suggests that these specific use of force incidents in California have seen an even greater reduction over the last few years.

But while these incidents have decreased statewide, this may not be true across all counties in the state. That is, this statewide decline in police use of force cases that result in serious bodily injury or death may mask increases in incidents in specific jurisdictions.

Given this, I segmented the data by individual counties to explore whether specific counties show reverse trends.

To do this, I first cleaned up the values in the COUNTY variable by using the recode function.

Incident_data <- Incident_data %>%
    mutate(COUNTY = recode(COUNTY, 
                           "ALAMEDA" = "Alameda County",
                           "BUTTE" = "Butte County",
                           "CALAVERAS" = "Calaveras County",
                           "CONTRA COSTA" = "Contra Costa County",
                           "DEL NORTE" = "Del Norte County",
                           "EL DORADO" = "El Dorado County",
                           "FRESNO" = "Fresno County",
                           "GLENN" = "Glenn County",
                           "HUMBOLDT" = "Humboldt County",
                           "IMPERIAL" = "Imperial County",
                           "KERN" = "Kern County",
                           "KINGS" = "Kings County",
                           "LAKE" = "Lake County",
                           "LASSEN" = "Lassen County",
                           "LOS ANGELES" = "Los Angeles County",
                           "MADERA" = "Madera County",
                           "MARIN" = "Marin County",
                           "MARIPOSA" = "Mariposa County",
                           "MENDOCINO" = "Mendocino County",
                           "MERCED" = "Merced County",
                           "MONTEREY" = "Monterey County",
                           "NAPA" = "Napa County",
                           "NEVADA" = "Nevada County",
                           "ORANGE" = "Orange County",
                           "PLACER" = "Placer County",
                           "PLUMAS" = "Plumas County",
                           "RIVERSIDE" = "Riverside County",
                           "SACRAMENTO" = "Sacramento County",
                           "SAN BERNARDINO" = "San Bernardino County",
                           "SAN DIEGO" = "San Diego County",
                           "SAN FRANCISCO" = "San Francisco County",
                           "SAN JOAQUIN" = "San Joaquin County",
                           "SAN LUIS OBISPO" = "San Luis Obispo County",
                           "SAN MATEO" = "San Mateo County",
                           "SANTA BARBARA" = "Santa Barbara County",
                           "SANTA CLARA" = "Santa Clara County",
                           "SANTA CRUZ" = "Santa Cruz County",
                           "SHASTA" = "Shasta County",
                           "SISKIYOU" = "Siskiyou County",
                           "SOLANO" = "Solano County",
                           "SONOMA" = "Sonoma County",
                           "STANISLAUS" = "Stanislaus County",
                           "SUTTER" = "Sutter County",
                           "TEHAMA" = "Tehama County",
                           "TRINITY" = "Trinity County",
                           "TULARE" = "Tulare County",
                           "TUOLUMNE" = "Tuolumne County",
                           "VENTURA" = "Ventura County",
                           "YOLO" = "Yolo County",
                           "YUBA" = "Yuba County"))

I then summarized the data with a table and graph.

Incident_data %>%
        group_by(year, COUNTY) %>%
        summarize(n = n()) %>%
        spread(year, n) %>%
        kable()
COUNTY 2016 2017 2018
Alameda County 20 38 26
Butte County 4 3 4
Calaveras County 2 1 1
Contra Costa County 27 23 20
Del Norte County 1 2 2
El Dorado County 1 NA 1
Fresno County 26 13 13
Glenn County 2 3 1
Humboldt County 6 4 2
Imperial County 2 NA 1
Inyo County 1 NA NA
Kern County 36 39 24
Kings County 3 4 2
Lake County 1 2 3
Lassen County 1 1 NA
Los Angeles County 212 196 174
Madera County 3 3 3
Marin County NA 1 1
Mariposa County 1 1 NA
Mendocino County 2 2 4
Merced County 2 7 4
Modoc County 2 NA NA
Monterey County 2 5 1
Napa County 4 3 3
Nevada County 1 NA 1
Orange County 61 31 41
Placer County 6 7 7
Plumas County 1 1 NA
Riverside County 60 40 45
Sacramento County 20 27 18
San Benito County 1 NA NA
San Bernardino County 71 62 70
San Diego County 49 67 48
San Francisco County 6 8 12
San Joaquin County 23 21 7
San Luis Obispo County 2 3 1
San Mateo County 15 12 7
Santa Barbara County 15 8 9
Santa Clara County 24 26 20
Santa Cruz County 4 3 2
Shasta County 3 3 5
Siskiyou County 2 1 2
Solano County 9 13 8
Sonoma County 6 8 4
Stanislaus County 11 4 5
Sutter County 2 3 1
Tehama County 2 4 8
Trinity County NA 1 NA
Tulare County 14 8 8
Tuolumne County NA 1 NA
Ventura County 7 4 8
Yolo County 1 2 1
Yuba County 5 5 2
ggplot(Incident_data) +
        geom_bar(mapping = aes(x = year)) +
        facet_wrap(~ COUNTY)

From a visual observation, it seems as if the declining trend in use of force incidents that result in serious bodily injury or death has decreased in the majority of counties in California. However, there are 58 counties in California, which makes the county breakdown a bit difficult to see. To make it easier, I’ll focus on only the top 15 counties in the state. There are several counties with less than a handful of incidents, and the top 15 counties make up about 85 percent of all incidents in the state. Therefore, trends in these jurisdictions will have the greatest impact on state-wide trends.

county_sort <- Incident_data %>%
        count(COUNTY, sort = TRUE) %>%
        head(15)

county_sort <- Incident_data %>%
        filter(COUNTY %in% county_sort$COUNTY)

ggplot(county_sort) +
        geom_bar(mapping = aes(x = year)) +
        facet_wrap(~ COUNTY)

In the majority of the top 15 counties in California, the declining trend in police use of force incidents that result in serious bodily injury or death holds. While some counties, such as Alameda County and San Diego County, showed a slight uptick from 2016 to 2017, they each showed demonstrable declining trends from 2017 to 2018. Other counties, such as Orange County, Riverside County and San Bernardino County, show a slight uptick in incidents from 2017 to 2018, but overall they remain quite steady across all three years.

Mapping Use of Force Cases in California

Let’s go ahead and visually represent these use of force cases on a map of California.

To begin, let’s use map_data to subset California counties.

counties <- map_data("county")

ca_counties <- counties %>%
        filter(region == "california")

Next, I want to make a vector map of California counties. This vector map will be used to plot the county data into their respective counties.

ca_base <- ggplot(data = ca_counties, mapping = aes(x = long, y = lat, group = group)) +
        coord_quickmap() +
        geom_polygon(color = "black", fill = "grey")

ca_base

I’m not a fan of the way this looks. So I’ll use theme_void() to remove the access grids and grey background.

ca_base <- ca_base + theme_void() +
        geom_polygon(data = ca_counties, fill = NA, color = "white") +
        geom_polygon(color = "white", fill = NA)

ca_base

Before I join the Incident_data with the ca_base, I want to (1) create a tibble with the county values for all California counties, (2) recode the county names to match those in the ca_base data and (3) ensure the column names are lowercase.

forcebycounty <- Incident_data %>%
        count(COUNTY)

forcebycounty <- forcebycounty %>%
    mutate(COUNTY = recode(COUNTY, 
                           "Alameda County" = "alameda",
                           "Butte County" = "butte",
                           "Calaveras County" = "calaveras",
                           "Contra Costa County" = "contra costa",
                           "Del Norte County" = "del norte",
                           "El Dorado County" = "el dorado",
                           "Fresno County" = "fresno",
                           "Glenn County" = "glenn",
                           "Humboldt County" = "humboldt",
                           "Imperial County" = "imperial",
                           "Kern County" = "kern",
                           "Kings County" = "kings",
                           "Lake County" = "lake",
                           "Lassen County" = "lassen",
                           "Los Angeles County" = "los angeles",
                           "Madera County" = "madera",
                           "Marin County" = "marin",
                           "Mariposa County" = "mariposa",
                           "Mendocino County" = "mendocino",
                           "Merced County" = "merced",
                           "Monterey County" = "monterey",
                           "Napa County" = "napa",
                           "Nevada County" = "nevada",
                           "Orange County" = "orange",
                           "Placer County" = "placer",
                           "Plumas County" = "plumas",
                           "Riverside County" = "riverside",
                           "Sacramento County" = "sacramento",
                           "San Bernardino County" = "san bernardino",
                           "San Diego County" = "san diego",
                           "San Francisco County" = "san francisco",
                           "San Joaquin County" = "san joaquin",
                           "San Luis Obispo County" = "san luis obispo",
                           "San Mateo County" = "san mateo",
                           "Santa Barbara County" = "santa barbara",
                           "Santa Clara County" = "santa clara",
                           "Santa Cruz County" = "santa cruz",
                           "Shasta County" = "shasta",
                           "Siskiyou County" = "siskiyou",
                           "Solano County" = "solano",
                           "Sonoma County" = "sonoma",
                           "Stanislaus County" = "stanislaus",
                           "Sutter County" = "sutter",
                           "Tehama County" = "tehama",
                           "Trinity County" = "trinity",
                           "Tulare County" = "tulare",
                           "Tuolumne County" = "tuolumne",
                           "Ventura County" = "ventura",
                           "Yolo County" = "yolo",
                           "Yuba County" = "yuba"))
forcebycounty <- mutate_each(forcebycounty, funs(tolower))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
forcebycounty
## # A tibble: 53 x 2
##    COUNTY       n    
##    <chr>        <chr>
##  1 alameda      84   
##  2 butte        11   
##  3 calaveras    4    
##  4 contra costa 70   
##  5 del norte    5    
##  6 el dorado    2    
##  7 fresno       52   
##  8 glenn        6    
##  9 humboldt     12   
## 10 imperial     3    
## # … with 43 more rows

Now, it’s time to join the two datasets using dplyr’s left_join function.

caforce <- left_join(ca_counties, forcebycounty, by = c("subregion" = "COUNTY"))

caforce <- tibble(caforce)
caforce <- caforce %>%
        mutate(n = as.numeric(n))
caforce
## # A tibble: 2,977 x 7
##     long   lat group order region     subregion     n
##    <dbl> <dbl> <dbl> <int> <chr>      <chr>     <dbl>
##  1 -121.  37.5   157  6965 california alameda      84
##  2 -122.  37.5   157  6966 california alameda      84
##  3 -122.  37.5   157  6967 california alameda      84
##  4 -122.  37.5   157  6968 california alameda      84
##  5 -122.  37.5   157  6969 california alameda      84
##  6 -122.  37.5   157  6970 california alameda      84
##  7 -122.  37.5   157  6971 california alameda      84
##  8 -122.  37.5   157  6972 california alameda      84
##  9 -122.  37.5   157  6973 california alameda      84
## 10 -122.  37.5   157  6974 california alameda      84
## # … with 2,967 more rows

Map the data

hotforce <- ca_base +
        geom_polygon(data = caforce, aes(fill = n), color = "white") + 
        geom_polygon(color = "black", fill = NA) +
        theme_void()
hotforce

We may want to adjust the color settings to make the graphic more accessible for individuals who suffer from color blindness. We can do this with the viridis package.

hotforce <- hotforce +
        scale_fill_viridis(breaks = c(100, 200, 300, 400, 500))

hotforce

We can also make this graph interactive using the plotly package.

ggplotly(hotforce)